First, install all the packages required.

# get the required R packages
# install.packages("ggplot2")
# install.packages("plyr")
# install.packages("dplyr")
# install.packages("caret")
# install.packages("moments")
# install.packages("glmnet")
# install.packages("elasticnet")
# install.packages("knitr")
# install.packages("Metrics")

Load the installed packages into R.

# get the required R packages
# NOTE: plyr is loaded before dplyr, so dplyr's verbs (filter, arrange, ...)
# mask plyr's versions — the code below depends on the dplyr implementations.
library(ggplot2)
library(plyr)
library(dplyr)
library(caret)      # trainControl() / train() for cross-validated model fitting
library(moments)
library(glmnet)     # backend for method = "glmnet" (ridge / lasso)
library(elasticnet)
library(knitr)
library(Metrics)    # rmse()
# read in all.rds (output from pre-processing.Rmd)
train <- readRDS("train.rds")
test <- readRDS("test.rds")
print(train)
print(test)

# train:test -> 80%:20%, split chronologically by row order (no shuffling).
# floor() guarantees integer row indices; the original 0.8*dim(train)[1]
# yields fractional indices whenever 0.8 * nrow is not a whole number,
# which R silently truncates and can make the two partitions inconsistent.
n_train <- floor(0.8 * nrow(train))
train_train <- train[seq_len(n_train), ]
test_train <- train[(n_train + 1):nrow(train), ]
print(train_train)
print(test_train)

Divide out the train and test dataset according to our initial proportion

# create data for training and test
# column 230 is the response variable (SalePrice — apparently on a log scale,
# given the exp() back-transform at the end of the script; TODO confirm
# against pre-processing.Rmd); the remaining 229 columns are predictors
X_train <- train_train[,-230]
X_test <- test_train[,-230]
y <- train_train[,230]

Now, model of regression is being set up.

# MODEL
# Shared caret training control: 5-fold cross-validation repeated 5 times,
# with per-iteration progress output suppressed.
CARET.TRAIN.CTRL <- trainControl(method = "repeatedcv",
                                 repeats = 5,
                                 number = 5,
                                 verboseIter = FALSE)

Testing a few models. 1) Ridge regression (suitable for multicollinear variables / variables with strong mutual interaction)

# Ridge regression: glmnet with alpha fixed at 0, tuning only the penalty.
ridge_lambdas <- seq(1, 0, by = -0.001)

set.seed(123)  # reproducible CV folds
model_ridge <- train(
  x = X_train, y = y,
  method = "glmnet",
  metric = "RMSE",
  maximize = FALSE,
  trControl = CARET.TRAIN.CTRL,
  tuneGrid = expand.grid(alpha = 0,  # alpha = 0 -> pure ridge penalty
                         lambda = ridge_lambdas)
)

Now we compute the RMSE to assess how accurate ridge regression is as the penalty parameter lambda varies.

# RMSE - Root Mean Square Error across the lambda grid.
# NOTE: the original accessed model_ridge$result, which only worked through
# R's partial matching of list names — the caret train element is "results".
ggplot(data=filter(model_ridge$results,RMSE<0.14)) +
  geom_line(aes(x=lambda,y=RMSE))

# average RMSE over all 25 resamples (5 folds x 5 repeats) at the best tune
mean(model_ridge$resample$RMSE)
[1] 0.1284926
2) Lasso regression method
# Lasso regression: glmnet with alpha fixed at 1, over a hand-picked
# lambda grid that is dense near the region of interest.
lasso_lambdas <- c(1, 0.1, 0.05, 0.01,
                   seq(0.009, 0.001, by = -0.001),
                   0.00075, 0.0005, 0.0001)

set.seed(123)  # reproducible CV folds
model_lasso <- train(
  x = X_train, y = y,
  method = "glmnet",
  metric = "RMSE",
  maximize = FALSE,
  trControl = CARET.TRAIN.CTRL,
  tuneGrid = expand.grid(alpha = 1,  # alpha = 1 -> pure lasso penalty
                         lambda = lasso_lambdas)
)
There were missing values in resampled performance measures.
model_lasso
glmnet 

1168 samples
 229 predictor

No pre-processing
Resampling: Cross-Validated (5 fold, repeated 5 times) 
Summary of sample sizes: 934, 935, 934, 935, 934, 935, ... 
Resampling results across tuning parameters:

  lambda   RMSE       Rsquared   MAE       
  0.00010  0.1338182  0.8902503  0.08856091
  0.00050  0.1292176  0.8970363  0.08533420
  0.00075  0.1281772  0.8984861  0.08447431
  0.00100  0.1275513  0.8993626  0.08387739
  0.00200  0.1257195  0.9019860  0.08295205
  0.00300  0.1247320  0.9034516  0.08294093
  0.00400  0.1244038  0.9040196  0.08316878
  0.00500  0.1245662  0.9038936  0.08352907
  0.00600  0.1250857  0.9032637  0.08408178
  0.00700  0.1259577  0.9020940  0.08490289
  0.00800  0.1270268  0.9006113  0.08585199
  0.00900  0.1281812  0.8989975  0.08685273
  0.01000  0.1292382  0.8975404  0.08775829
  0.05000  0.1600044  0.8653493  0.11104276
  0.10000  0.1977531  0.8426895  0.14172801
  1.00000  0.4011399        NaN  0.31142884

Tuning parameter 'alpha' was held constant at a value of 1
RMSE was used to select the optimal model using the smallest value.
The final values used for the model were alpha = 1 and lambda = 0.004.
mean(model_lasso$resample$RMSE)
[1] 0.1244038

From the results above, lambda = 0.004 gives the best R-squared score, explaining almost 90% of the variance. We use this value to build our final model.

# extract coefficients for the best performing model
# (compute the glmnet coefficient matrix once instead of twice)
best_coefs <- coef(model_lasso$finalModel, s = model_lasso$bestTune$lambda)
# NOTE: the name `coef` shadows base R's coef() for value lookups; it is kept
# because the plotting code later in this script expects this exact name.
coef <- data.frame(coef.name  = dimnames(best_coefs)[[1]],
                   coef.value = matrix(best_coefs))
# exclude the (Intercept) term
coef <- coef[-1, ]
# print summary of model results: how many predictors lasso kept vs zeroed out
picked_features <- nrow(filter(coef, coef.value != 0))
not_picked_features <- nrow(filter(coef, coef.value == 0))
cat("Lasso picked",picked_features,"variables and eliminated the other",
    not_picked_features,"variables\n")
Lasso picked 84 variables and eliminated the other 145 variables

Plot the largest coefficients' relationships with the price variable, then apply the model to the held-out data.

# sort coefficients in descending order of value
coef <- arrange(coef,-coef.value)
# extract the top 10 (most positive) and bottom 10 (most negative) features
imp_coef <- rbind(head(coef,10),
                  tail(coef,10))
ggplot(imp_coef) +
  geom_bar(aes(x=reorder(coef.name,coef.value),y=coef.value),
           stat="identity") +
  # NOTE(review): ylim() drops (not merely crops) any bar whose value falls
  # outside [-1.5, 0.6] — confirm all 20 coefficients lie in this range
  ylim(-1.5,0.6) +
  coord_flip() +
  ggtitle("Coefficents in the Lasso Model") +
  theme(axis.title=element_blank())

Create a new CSV file binding the IDs with the sale-price predictions from the lasso model.

# create submission file from the lasso model's predictions
preds <- predict(model_lasso,newdata=X_test)
# construct data frame for solution: row names of X_test serve as the Id
solution <- data.frame(Id=as.integer(rownames(X_test)),SalePrice=preds)
# NOTE(review): the filename says "ridge" but these are lasso predictions —
# consider renaming; kept as-is since downstream steps may expect this name
write.csv(solution,"ridge_sol.csv",row.names=FALSE)
#evaluation of results on the held-out 20% of the training data
# Pearson correlation between predictions and the actual response (column 230)
cor(preds,test_train[,230])
[1] 0.9324788
# root mean squared error (Metrics::rmse), on the same scale as the target
rmse(test_train[,230],preds)
[1] 0.1418151
#visualizing the results
# exp() back-transforms predictions and actuals — assumes the target was
# modelled on a log scale (TODO confirm against pre-processing.Rmd)
plot(exp(preds),exp(test_train[,230]),xlab="Predicted Label",ylab="Actual Label",main="Plot of Actual Against Predicted Labels")
# overlay the least-squares fit of actual against predicted (blue line)
lin.mod=lm(exp(test_train[,230])~exp(preds))
pr.lm=predict(lin.mod)
lines(pr.lm~exp(preds), col="blue", lwd=0.5)
# 45-degree reference line: points on it would be perfect predictions
lines(c(0,450000), c(0,450000))
legend("topleft", legend=c("fitted line", "45 degree line"),col=c("blue", "black"), lty=1, cex=0.8)

LS0tDQp0aXRsZTogIkFkdmFuY2VkIHJlZ3Jlc3Npb24gbWV0aG9kIG9uIFByaWNlIERhdGEiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KRmlyc3QsIEluc3RhbGxpbmcgYWxsIHRoZSBwYWNrYWdlc3JlcXVpcmVkIA0KYGBge3J9DQojIGdldCB0aGUgcmVxdWlyZSBSIHBhY2thZ2VzDQojIGluc3RhbGwucGFja2FnZXMoImdncGxvdDIiKQ0KIyBpbnN0YWxsLnBhY2thZ2VzKCJwbHlyIikNCiMgaW5zdGFsbC5wYWNrYWdlcygiZHBseXIiKQ0KIyBpbnN0YWxsLnBhY2thZ2VzKCJjYXJldCIpDQojIGluc3RhbGwucGFja2FnZXMoIm1vbWVudHMiKQ0KIyBpbnN0YWxsLnBhY2thZ2VzKCJnbG1uZXQiKQ0KIyBpbnN0YWxsLnBhY2thZ2VzKCJlbGFzdGljbmV0IikNCiMgaW5zdGFsbC5wYWNrYWdlcygia25pdHIiKQ0KIyBpbnN0YWxsLnBhY2thZ2VzKCJNZXRyaWNzIikNCmBgYA0KTG9hZGVkIHRoZSBwYWNrYWdlcyBpbnN0YWxsZWQgdGhyb3VnaCBSIA0KYGBge3J9DQojIGdldCB0aGUgcmVxdWlyZSBSIHBhY2thZ2VzDQpsaWJyYXJ5KGdncGxvdDIpDQpsaWJyYXJ5KHBseXIpDQpsaWJyYXJ5KGRwbHlyKQ0KbGlicmFyeShjYXJldCkNCmxpYnJhcnkobW9tZW50cykNCmxpYnJhcnkoZ2xtbmV0KQ0KbGlicmFyeShlbGFzdGljbmV0KQ0KbGlicmFyeShrbml0cikNCmxpYnJhcnkoTWV0cmljcykNCmBgYA0KYGBge3J9DQojcmVhZCBpbiBhbGwucmRzKG91dHB1dCBmcm9tIHByZS1wcm9jZXNzaW5nLlJtZCkNCnRyYWluPXJlYWRSRFMoInRyYWluLnJkcyIpDQp0ZXN0PXJlYWRSRFMoInRlc3QucmRzIikNCnByaW50KHRyYWluKQ0KcHJpbnQodGVzdCkNCmBgYA0KDQpgYGB7cn0NCiN0cmFpbjp0ZXN0IC0+IDgwJToyMCUNCnRyYWluX3RyYWluPXRyYWluWzE6KDAuOCpkaW0odHJhaW4pWzFdKSxdDQp0ZXN0X3RyYWluPXRyYWluWygwLjgqZGltKHRyYWluKVsxXSsxKTooZGltKHRyYWluKVsxXSksXQ0KcHJpbnQodHJhaW5fdHJhaW4pDQpwcmludCh0ZXN0X3RyYWluKQ0KYGBgDQoNCkRpdmlkZSBvdXQgdGhlIHRyYWluIGFuZCB0ZXN0IGRhdGFzZXQgYWNjb3JkaW5nIHRvIG91ciBpbml0aWFsIHByb3BvcnRpb24NCmBgYHtyfQ0KIyBjcmVhdGUgZGF0YSBmb3IgdHJhaW5pbmcgYW5kIHRlc3QNClhfdHJhaW4gPC0gdHJhaW5fdHJhaW5bLC0yMzBdDQpYX3Rlc3QgPC0gdGVzdF90cmFpblssLTIzMF0NCnkgPC0gdHJhaW5fdHJhaW5bLDIzMF0NCmBgYA0KDQpOb3csIG1vZGVsIG9mIHJlZ3Jlc3Npb24gaXMgYmVpbmcgc2V0IHVwLg0KYGBge3J9DQojTU9ERUwNCiMgc2V0IHVwIGNhcmV0IG1vZGVsIHRyYWluaW5nIHBhcmFtZXRlcnMNCiMgbW9kZWwgc3BlY2lmaWMgdHJhaW5pbmcgcGFyYW1ldGVyDQpDQVJFVC5UUkFJTi5DVFJMIDwtIHRyYWluQ29udHJvbChtZXRob2Q9InJlcGVhdGVkY3YiLA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgbnVtYmVyPTUsDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICByZXBlYXRzPTUs
DQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICB2ZXJib3NlSXRlcj1GQUxTRSkNCmBgYA0KIFRlc2luZyBpdCBpbiBmZXcgbW9kZWwNCiAxKSBSaWRnZSBSZWdyZXNzaW9uIChzdWl0YWJsZSBmb3IgbXVsdGljb2xsaW5lYXJpdHkgdmFyaWFibGUvIGVhY2ggdmFyaWFibGUgaGF2ZSBoaWdoIGludGVyYWN0aW9uKQ0KIA0KYGBge3J9DQojIHRlc3Qgb3V0IFJpZGdlIHJlZ3Jlc3Npb24gbW9kZWwNCmxhbWJkYXMgPC0gc2VxKDEsMCwtMC4wMDEpDQoNCiMgdHJhaW4gbW9kZWwNCnNldC5zZWVkKDEyMykgICMgZm9yIHJlcHJvZHVjaWJpbGl0eQ0KbW9kZWxfcmlkZ2UgPC0gdHJhaW4oeD1YX3RyYWluLHk9eSwNCiAgICAgICAgICAgICAgICAgICAgIG1ldGhvZD0iZ2xtbmV0IiwNCiAgICAgICAgICAgICAgICAgICAgIG1ldHJpYz0iUk1TRSIsDQogICAgICAgICAgICAgICAgICAgICBtYXhpbWl6ZT1GQUxTRSwNCiAgICAgICAgICAgICAgICAgICAgIHRyQ29udHJvbD1DQVJFVC5UUkFJTi5DVFJMLA0KICAgICAgICAgICAgICAgICAgICAgdHVuZUdyaWQ9ZXhwYW5kLmdyaWQoYWxwaGE9MCwgIyBSaWRnZSByZWdyZXNzaW9uDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBsYW1iZGE9bGFtYmRhcykpDQpgYGANCk5vdyB3ZSBjYWxjdWxhdGVkIFJNU0UgdG8gdGVzdCBvbiBIb3cgYWNjdXJhdGUgaXMgUmlkZ2UgcmVncmVzc2lvbiB3aXRoIHZhcnlpbmcgdmFsdWUgb2YgSyBvciBsYW1iZGENCiANCmBgYHtyfQ0KI1JNU0UtUm9vdCBNZWFuIFNxdWFyZSBFcnJvcg0KZ2dwbG90KGRhdGE9ZmlsdGVyKG1vZGVsX3JpZGdlJHJlc3VsdCxSTVNFPDAuMTQpKSArDQogIGdlb21fbGluZShhZXMoeD1sYW1iZGEseT1STVNFKSkNCg0KbWVhbihtb2RlbF9yaWRnZSRyZXNhbXBsZSRSTVNFKQ0KYGBgDQoyKSBVc2VkIExhc3NvIHJlZ3Jlc3Npb24gbWV0aG9kDQpgYGB7cn0NCiMgdGVzdCBvdXQgTGFzc28gcmVncmVzc2lvbiBtb2RlbA0KDQojIHRyYWluIG1vZGVsDQpzZXQuc2VlZCgxMjMpICAjIGZvciByZXByb2R1Y2liaWxpdHkNCm1vZGVsX2xhc3NvIDwtIHRyYWluKHg9WF90cmFpbix5PXksDQogICAgICAgICAgICAgICAgICAgICBtZXRob2Q9ImdsbW5ldCIsDQogICAgICAgICAgICAgICAgICAgICBtZXRyaWM9IlJNU0UiLA0KICAgICAgICAgICAgICAgICAgICAgbWF4aW1pemU9RkFMU0UsDQogICAgICAgICAgICAgICAgICAgICB0ckNvbnRyb2w9Q0FSRVQuVFJBSU4uQ1RSTCwNCiAgICAgICAgICAgICAgICAgICAgIHR1bmVHcmlkPWV4cGFuZC5ncmlkKGFscGhhPTEsICAjIExhc3NvIHJlZ3Jlc3Npb24NCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGxhbWJkYT1jKDEsMC4xLDAuMDUsMC4wMSxzZXEoMC4wMDksMC4wMDEsLTAuMDAxKSwNCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIDAuMDAwNzUsMC4wMDA1LDAuMDAwMSkpKQ0KbW9k
ZWxfbGFzc28NCg0KbWVhbihtb2RlbF9sYXNzbyRyZXNhbXBsZSRSTVNFKQ0KDQpgYGANCkZyb20gcHJldmlvdXMgcHJlZGljdGlvbiwgbGFtYmRhIDAuMDA0IGdpdmUgYmVzdCBSc3F1YXJlIHNjb3JlIHdoaWNoIGV4cGxhaW4gYWxtb3N0IDkwJSBvZiB0aGUgdmFyaWFibGVzDQpXZSB1c2VkIHRoaXMgdG8gYnVpbGQgdXAgb3VyIGZpbmFsIG1vZGVsDQpgYGB7cn0NCiMgZXh0cmFjdCBjb2VmZmljaWVudHMgZm9yIHRoZSBiZXN0IHBlcmZvcm1pbmcgbW9kZWwNCmNvZWYgPC0gZGF0YS5mcmFtZShjb2VmLm5hbWUgPSBkaW1uYW1lcyhjb2VmKG1vZGVsX2xhc3NvJGZpbmFsTW9kZWwscz1tb2RlbF9sYXNzbyRiZXN0VHVuZSRsYW1iZGEpKVtbMV1dLCANCiAgICAgICAgICAgICAgICAgICBjb2VmLnZhbHVlID0gbWF0cml4KGNvZWYobW9kZWxfbGFzc28kZmluYWxNb2RlbCxzPW1vZGVsX2xhc3NvJGJlc3RUdW5lJGxhbWJkYSkpKQ0KDQojIGV4Y2x1ZGUgdGhlIChJbnRlcmNlcHQpIHRlcm0NCmNvZWYgPC0gY29lZlstMSxdDQoNCiMgcHJpbnQgc3VtbWFyeSBvZiBtb2RlbCByZXN1bHRzDQpwaWNrZWRfZmVhdHVyZXMgPC0gbnJvdyhmaWx0ZXIoY29lZixjb2VmLnZhbHVlIT0wKSkNCm5vdF9waWNrZWRfZmVhdHVyZXMgPC0gbnJvdyhmaWx0ZXIoY29lZixjb2VmLnZhbHVlPT0wKSkNCg0KY2F0KCJMYXNzbyBwaWNrZWQiLHBpY2tlZF9mZWF0dXJlcywidmFyaWFibGVzIGFuZCBlbGltaW5hdGVkIHRoZSBvdGhlciIsDQogICAgbm90X3BpY2tlZF9mZWF0dXJlcywidmFyaWFibGVzXG4iKQ0KDQpgYGANClBsb3R0aW5nIGFsbCBjb2VmZmljaWVudCByZWxhdGlvbnNoaXAgd2l0aCB0aGUgcHJpY2UgdmFyaWFibGUsIGFwcGx5aW5nIHRoZSBtb2RlbCBvbiBkYXRhIA0KYGBge3J9DQojIHNvcnQgY29lZmZpY2llbnRzIGluIGFzY2VuZGluZyBvcmRlcg0KY29lZiA8LSBhcnJhbmdlKGNvZWYsLWNvZWYudmFsdWUpDQoNCiMgZXh0cmFjdCB0aGUgdG9wIDEwIGFuZCBib3R0b20gMTAgZmVhdHVyZXMNCmltcF9jb2VmIDwtIHJiaW5kKGhlYWQoY29lZiwxMCksDQogICAgICAgICAgICAgICAgICB0YWlsKGNvZWYsMTApKQ0KDQpnZ3Bsb3QoaW1wX2NvZWYpICsNCiAgZ2VvbV9iYXIoYWVzKHg9cmVvcmRlcihjb2VmLm5hbWUsY29lZi52YWx1ZSkseT1jb2VmLnZhbHVlKSwNCiAgICAgICAgICAgc3RhdD0iaWRlbnRpdHkiKSArDQogIHlsaW0oLTEuNSwwLjYpICsNCiAgY29vcmRfZmxpcCgpICsNCiAgZ2d0aXRsZSgiQ29lZmZpY2VudHMgaW4gdGhlIExhc3NvIE1vZGVsIikgKw0KICB0aGVtZShheGlzLnRpdGxlPWVsZW1lbnRfYmxhbmsoKSkNCmBgYA0KQ3JlYXRlZCBuZXcgQ1NWIGZpbGUgYW5kIGJpbmQgd2l0aCBzYWxlcyBwcmljZSBjb2x1bW4gcmVzdWx0IG9mIGxhc3NvIG1ldGhvZC4NCmBgYHtyfQ0KIyBtYWtlIGNyZWF0ZSBzdWJtaXNzaW9uIGZpbGUNCnByZWRzIDwtIHByZWRpY3QobW9kZWxfbGFzc28sbmV3ZGF0YT1YX3Rlc3QpDQoNCiMgY29uc3Ry
dWN0IGRhdGEgZnJhbWUgZm9yIHNvbHV0aW9uDQpzb2x1dGlvbiA8LSBkYXRhLmZyYW1lKElkPWFzLmludGVnZXIocm93bmFtZXMoWF90ZXN0KSksU2FsZVByaWNlPXByZWRzKQ0Kd3JpdGUuY3N2KHNvbHV0aW9uLCJyaWRnZV9zb2wuY3N2Iixyb3cubmFtZXM9RkFMU0UpDQpgYGANCg0KYGBge3J9DQojZXZhbHVhdGlvbiBvZiByZXN1bHRzDQpjb3IocHJlZHMsdGVzdF90cmFpblssMjMwXSkNCnJtc2UodGVzdF90cmFpblssMjMwXSxwcmVkcykNCmBgYA0KDQpgYGB7cn0NCiN2aXN1YWxpemluZyB0aGUgcmVzdWx0cw0KcGxvdChleHAocHJlZHMpLGV4cCh0ZXN0X3RyYWluWywyMzBdKSx4bGFiPSJQcmVkaWN0ZWQgTGFiZWwiLHlsYWI9IkFjdHVhbCBMYWJlbCIsbWFpbj0iUGxvdCBvZiBBY3R1YWwgQWdhaW5zdCBQcmVkaWN0ZWQgTGFiZWxzIikNCmxpbi5tb2Q9bG0oZXhwKHRlc3RfdHJhaW5bLDIzMF0pfmV4cChwcmVkcykpDQpwci5sbT1wcmVkaWN0KGxpbi5tb2QpDQpsaW5lcyhwci5sbX5leHAocHJlZHMpLCBjb2w9ImJsdWUiLCBsd2Q9MC41KQ0KbGluZXMoYygwLDQ1MDAwMCksIGMoMCw0NTAwMDApKQ0KDQpsZWdlbmQoInRvcGxlZnQiLCBsZWdlbmQ9YygiZml0dGVkIGxpbmUiLCAiNDUgZGVncmVlIGxpbmUiKSxjb2w9YygiYmx1ZSIsICJibGFjayIpLCBsdHk9MSwgY2V4PTAuOCkNCmBgYA0KDQo=